import pandas as pd



data = pd.read_excel(r"日本收入支出数据整理.xlsx",header=0, index_col=0, sheet_name=None)

data = {k[:4]:v for k,v in data.items()}


def extract_data(data, year):
    print(year)
    data20 = data[year]
    data20.index = data20['中文翻译']
    data20 = data20.loc[:, ['Ⅰ.1',
                            'Ⅱ.1',
                            'Ⅲ.1',
                            'Ⅳ.1',
                            'Ⅴ.1',
                            'Ⅵ',
                            'Ⅶ',
                            'Ⅷ',
                            'Ⅸ',
                            'Ⅹ',]
                        ]

    data20 = data20.iloc[:-45,:]

    filter_idx = [
        '食品', '居住', '水电燃气费', '家具・家务用品', '服装及鞋类', '保健医疗', '交通、通信', '教育', '教育娱乐', '其他消费支出'
    ]

    data20_filter = data20.loc[filter_idx, :]
    data20_filter.columns = [f"第{i}组" for i in range(10, 0, -1)]
    return data20_filter

def remove_dup_idx(df):
    idx_raw = df.index
    idxs = []
    filter_i = []
    for i, idx in enumerate(idx_raw):
        if idx not in idxs:
            idxs.append(idx)
            filter_i.append(i)
    df_out = df.iloc[filter_i, :]
    return df_out


ext_data = {year:extract_data(data, year) for year in data.keys()}
ext_data = {year:remove_dup_idx(df) for year, df in ext_data.items()}


li = []
for year, df in ext_data.items():
    df['year'] = year
    li.append(df)

df_out = pd.concat(li, axis=0)

df_out.to_excel(r"日本收入支出数据整理5.xlsx")